# Produces Figure 2, Tables 2, 3 in section 3 and Tables A3, A4 in the appendix


# Libraries
library(tidyverse)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(broom)
library(data.table) 
library(dplyr)
library(msm)
library(plm)



wkdir <- "/Users/huiyuli/Dropbox/Entry Technology/JPE macro submission/Replication Script"


#################### Figure 2 #################### 
# Import data on state real gsp, number of firms, and employment
plotyear = 2020
data_gsp <- fread(paste(wkdir, "raw data/realStateGDP.csv",  sep="/")) %>% 
  rename(real_gsp = rgdp_haver)
data_gsp <- mutate(data_gsp, state_abbrev = toupper(state_abbrev))[grep(plotyear, year)]
data_fips <- fread(paste(wkdir, "raw data/stateFips.csv",  sep="/"))
data_gsp <- merge(data_gsp, data_fips, by.x = "state_abbrev", by.y = "stusps", all.x = TRUE, all.y = TRUE)
data_bds <- fread(paste(wkdir, "raw data/bds2020_st.csv",  sep="/"),  select = c("year", "firms", "emp",'st'))[grep(2020, year)][,c("st", "firms", "emp") ]
data_gsp_bds <- merge(data_gsp, data_bds, by.x = "st", by.y = "st", all.x = TRUE, all.y = TRUE)
data_gsp_bds <-mutate(data_gsp_bds, ln_GSP_worker = log(real_gsp / emp),
       ln_emp_per_firm = log(emp / firms),
       ln_emp = log(emp),
       ln_firms = log(firms))

# Figure 2 (a). Regress the log firms on log emp 
eq <- lm(ln_firms ~ ln_emp, data = data_gsp_bds)
tidy_eq <- tidy(eq)
regtext <- paste(paste0('y = ', round(summary(eq)$coefficients[2, 1], digits = 2), 'x + ', round(summary(eq)$coefficients[1,1], digits = 2)),
             paste0('SE: ', round(summary(eq)$coefficients[2,2], digits = 2)),
             paste0('R-squared: ', round(summary(eq)$r.squared, digits = 2)),
             sep = '\n')
regtext <- ''   # comment out to add regression results to the graph
state_firms_emp <- ggplot(data_gsp_bds, aes(x = ln_emp, y = ln_firms)) +
  geom_point() +
  stat_poly_line(se = FALSE) +
  theme(aspect.ratio=1) +
  #coord_fixed(ratio=1) +
  geom_text(data = data.frame(annotateText = regtext),
            aes(x = -Inf, y = Inf, hjust = 0, vjust = 1.25, label = annotateText), size = 5) +
      geom_text_repel(aes(label = state_abbrev), max.overlaps = 75, min.segment.length=0) +
      scale_x_continuous(breaks = seq(12, 17, 1), labels = seq(12, 17, 1), limits = c(12, 17)) +
      scale_y_continuous(breaks = seq(9, 14, 1), labels = seq(9, 14, 1), limits = c(9, 14)) +
      theme_bw() +
  #labs(caption = 'Source: BDS') +
  ylab("log Firms") +
  xlab("log Employment") + 
  theme(axis.title = element_text(size = 18))
ggsave(paste(wkdir, "output/figures/figure2a.png",  sep="/"), plot = state_firms_emp, device = "png")


# Figure 2 (b). Regress the log emp per firm on log real gdp per emp
eq <- lm( ln_emp_per_firm ~ ln_GSP_worker, data = data_gsp_bds)
tidy_eq <- tidy(eq)
regtext <- paste(paste0('y = ', round(summary(eq)$coefficients[2, 1], digits = 2), 'x + ', round(summary(eq)$coefficients[1,1], digits = 2)),
                 paste0('SE: ', round(summary(eq)$coefficients[2,2], digits = 2)),
                 paste0('R-squared: ', round(summary(eq)$r.squared, digits = 2)),
                 sep = '\n')
regtext <- ''   # comment out to add regression results to the graph
state_empfirms_prod  <- ggplot(data_gsp_bds, aes(x = ln_GSP_worker, y = ln_emp_per_firm)) +
  geom_point() +
  stat_poly_line(se = FALSE) +
  #coord_fixed(ratio=1)+
  theme(aspect.ratio=1) +
  geom_text(data = data.frame(annotateText = regtext),
            aes(x = -Inf, y = Inf, hjust = 0, vjust = 1.25, label = annotateText), size = 5) +
                 geom_text_repel(aes(label = state_abbrev), max.overlaps = 75, min.segment.length=0) +
                 scale_x_continuous(breaks = seq(-3, -1, 0.5), labels = seq(-3, -1, 0.5), limits = c(-3, -1)) +
                 scale_y_continuous(breaks = seq(2, 4, 0.5), labels = seq(2, 4, 0.5), limits = c(2, 4)) +
            theme_bw() +
  #labs(caption = 'Source: BDS') +
  ylab("log Employment per firm") +
  xlab("log GSP per worker") + 
  theme(axis.title = element_text(size = 18))
ggsave(paste(wkdir, "output/figures/figure2b.png", sep="/"), plot = state_empfirms_prod, device = "png")


##################### Prepare data for Table 2, 3  #################### 
data_gsp <- fread(paste(wkdir, "raw data/realStateGDP.csv",  sep="/")) %>% 
  rename(real_gsp = rgdp_haver)
data_gsp <- mutate(data_gsp, state_abbrev = toupper(state_abbrev))
data_fips <- fread(paste(wkdir, "raw data/stateFips.csv",  sep="/"))
data_gsp <- merge(data_gsp, data_fips, by.x = "state_abbrev", by.y = "stusps", all.x = TRUE, all.y = TRUE)
data_bds <- fread(paste(wkdir, "raw data/bds2020_st.csv",  sep="/"),  select = c("year", "firms", "emp",'st'))
data_gsp_bds <- merge(data_gsp, data_bds, by.x = c("st", "year"), by.y = c("st","year"), all.x = FALSE, all.y = TRUE)
data_bds_entrant <- fread(paste(wkdir, "raw data/bds2020_st_fa.csv",  sep="/"),  
                          select = c("year", "st", "fage", "firms", "emp"))[grep('a) 0', fage)][, c("year", "st", "firms", "emp")]
data_bds_entrant <- as.data.frame(sapply(data_bds_entrant, as.numeric)) 
data_bds_entrant <- mutate(data_bds_entrant, ln_emp_per_firm_entrant = log(emp / firms))
data_gsp_bds <- merge(data_gsp_bds, data_bds_entrant[, c("year", "st", "ln_emp_per_firm_entrant")], by.x = c("st","year"), by.y = c("st","year"), all.x = TRUE, all.y = FALSE)
data_gsp_bds <-mutate(data_gsp_bds, ln_GSP_worker = log(real_gsp / emp),
                      ln_emp_per_firm = log(emp / firms),
                      ln_emp = log(emp),
                      ln_firms = log(firms))

data_gsp_bds <- data_gsp_bds[ data_gsp_bds$state_abbrev != "DC",]
data_gsp_bds$ln_firms_lag <- dplyr::lag(data_gsp_bds$ln_firms, n=1, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
firstyear = min(data_gsp_bds$year)
data_gsp_bds[data_gsp_bds$year==firstyear, c("ln_firms_lag")]<- NA

# calculate 1 year change in the variables
h = 1
data_gsp_bds$ln_firms_lag_lagged1 <- dplyr::lag(data_gsp_bds$ln_firms_lag, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_firm_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp_per_firm, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_firm_entrant_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp_per_firm_entrant, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_GSP_worker_lagged1 <- dplyr::lag(data_gsp_bds$ln_GSP_worker, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds[data_gsp_bds$year==firstyear+h-1, c("ln_firms_lag_lagged1", "ln_emp_lagged1","ln_emp_per_firm_lagged1","ln_emp_per_firm_entrant_lagged1","ln_GSP_worker_lagged1")]<- NA

# 1 year change in the regression variables
data_gsp_bds$d_ln_firms_lag_1 = data_gsp_bds$ln_firms_lag - data_gsp_bds$ln_firms_lag_lagged1
data_gsp_bds$d_ln_emp_1 = data_gsp_bds$ln_emp - data_gsp_bds$ln_emp_lagged1
data_gsp_bds$d_ln_emp_per_firm_1 = data_gsp_bds$ln_emp_per_firm -  data_gsp_bds$ln_emp_per_firm_lagged1 
data_gsp_bds$d_ln_emp_per_firm_entrant_1 = data_gsp_bds$ln_emp_per_firm_entrant -  data_gsp_bds$ln_emp_per_firm_entrant_lagged1 
data_gsp_bds$d_ln_GSP_worker_1 = data_gsp_bds$ln_GSP_worker - data_gsp_bds$ln_GSP_worker_lagged1


# 41 year lag
h = 41
data_gsp_bds$ln_firms_lag_lagged41 <- dplyr::lag(data_gsp_bds$ln_firms_lag, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_firm_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp_per_firm, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_firm_entrant_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp_per_firm_entrant, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_GSP_worker_lagged41 <- dplyr::lag(data_gsp_bds$ln_GSP_worker, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds[data_gsp_bds$year<=firstyear+h-1, c("ln_firms_lag_lagged41","ln_emp_lagged41","ln_emp_per_firm_lagged41","ln_emp_per_firm_entrant_lagged41","ln_GSP_worker_lagged41")]<- NA

# 41 year change in the regression variables
data_gsp_bds$d_ln_firms_lag_41 = data_gsp_bds$ln_firms_lag - data_gsp_bds$ln_firms_lag_lagged41
data_gsp_bds$d_ln_emp_41 = data_gsp_bds$ln_emp - data_gsp_bds$ln_emp_lagged41
data_gsp_bds$d_ln_emp_per_firm_41 = data_gsp_bds$ln_emp_per_firm -  data_gsp_bds$ln_emp_per_firm_lagged41 
data_gsp_bds$d_ln_emp_per_firm_entrant_41 = data_gsp_bds$ln_emp_per_firm_entrant -  data_gsp_bds$ln_emp_per_firm_entrant_lagged41 
data_gsp_bds$d_ln_GSP_worker_41 = data_gsp_bds$ln_GSP_worker - data_gsp_bds$ln_GSP_worker_lagged41
View(data_gsp_bds)

#####################  Table 2 #################### 
sigma = 4 
# Col 1
eq_col <- lm( d_ln_emp_per_firm_41 ~ d_ln_GSP_worker_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(d_ln_emp_per_firm_41 ~ d_ln_GSP_worker_41 + d_ln_firms_lag_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
coef_firm = summary(eq_col)$coefficients["d_ln_firms_lag_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_firms_lag_41", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( d_ln_emp_per_firm_1 ~ d_ln_GSP_worker_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(d_ln_emp_per_firm_1 ~ d_ln_GSP_worker_1 + d_ln_firms_lag_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
coef_firm = summary(eq_col)$coefficients["d_ln_firms_lag_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_firms_lag_1", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

table <- cbind(table.col1, table.col2, table.col3, table.col4)
table <- round(table, digits=3)
rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("41 years","41 years lagged N","1 years","1 years lagged N")
table2 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(table2)

#####################  Table 3 #################### 
sigma = 4 
# Col 1
eq_col <- lm( d_ln_emp_per_firm_entrant_41 ~ d_ln_GSP_worker_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(d_ln_emp_per_firm_entrant_41 ~ d_ln_GSP_worker_41 + d_ln_firms_lag_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
coef_firm = summary(eq_col)$coefficients["d_ln_firms_lag_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_firms_lag_41", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( d_ln_emp_per_firm_entrant_1 ~ d_ln_GSP_worker_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(d_ln_emp_per_firm_entrant_1 ~ d_ln_GSP_worker_1 + d_ln_firms_lag_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
coef_firm = summary(eq_col)$coefficients["d_ln_firms_lag_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_firms_lag_1", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

table <- cbind(table.col1, table.col2, table.col3, table.col4)
table <- round(table, digits=3)
rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("41 years","41 years lagged N","1 years","1 years lagged N")
table3 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(table3)

# save regression results
write.csv(table2,file=paste(wkdir, "output/tables/table2.csv",  sep="/"), row.names=TRUE)
write.csv(table3,file=paste(wkdir, "output/tables/table3.csv",  sep="/"), row.names=TRUE)




##################### Prepare data for Table A3, A4  #################### 
data_gsp <- fread(paste(wkdir, "raw data/realStateGDP.csv",  sep="/")) %>% 
  rename(real_gsp = rgdp_haver)
data_gsp <- mutate(data_gsp, state_abbrev = toupper(state_abbrev))
data_fips <- fread(paste(wkdir, "raw data/stateFips.csv",  sep="/"))
data_gsp <- merge(data_gsp, data_fips, by.x = "state_abbrev", by.y = "stusps", all.x = TRUE, all.y = TRUE)
data_bds <- fread(paste(wkdir, "raw data/bds2020_st.csv",  sep="/"),  select = c("year", "estabs", "emp",'st'))
data_gsp_bds <- merge(data_gsp, data_bds, by.x = c("st", "year"), by.y = c("st","year"), all.x = FALSE, all.y = TRUE)
data_bds_entrant <- fread(paste(wkdir, "raw data/bds2020_st_ea.csv",  sep="/"),  
                          select = c("year", "st", "eage", "estabs", "emp"))[grep('a) 0', eage)][, c("year", "st", "estabs", "emp")]
data_bds_entrant <- as.data.frame(sapply(data_bds_entrant, as.numeric)) 
data_bds_entrant <- mutate(data_bds_entrant, ln_emp_per_estab_entrant = log(emp / estabs))
data_gsp_bds <- merge(data_gsp_bds, data_bds_entrant[, c("year", "st", "ln_emp_per_estab_entrant")], by.x = c("st","year"), by.y = c("st","year"), all.x = TRUE, all.y = FALSE)
data_gsp_bds <-mutate(data_gsp_bds, ln_GSP_worker = log(real_gsp / emp),
                      ln_emp_per_estab = log(emp / estabs),
                      ln_emp = log(emp),
                      ln_estabs = log(estabs))

data_gsp_bds <- data_gsp_bds[ data_gsp_bds$state_abbrev != "DC",]
data_gsp_bds$ln_estabs_lag <- dplyr::lag(data_gsp_bds$ln_estabs, n=1, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
firstyear = min(data_gsp_bds$year)
data_gsp_bds[data_gsp_bds$year==firstyear, c("ln_estabs_lag")]<- NA

# calculate 1 year change in the variables
h = 1
data_gsp_bds$ln_estabs_lag_lagged1 <- dplyr::lag(data_gsp_bds$ln_estabs_lag, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_estab_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp_per_estab, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_estab_entrant_lagged1 <- dplyr::lag(data_gsp_bds$ln_emp_per_estab_entrant, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_GSP_worker_lagged1 <- dplyr::lag(data_gsp_bds$ln_GSP_worker, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds[data_gsp_bds$year==firstyear+h-1, c("ln_estabs_lag_lagged1", "ln_emp_lagged1","ln_emp_per_estab_lagged1","ln_emp_per_estab_entrant_lagged1","ln_GSP_worker_lagged1")]<- NA

# 1 year change in the regression variables
data_gsp_bds$d_ln_estabs_lag_1 = data_gsp_bds$ln_estabs_lag - data_gsp_bds$ln_estabs_lag_lagged1
data_gsp_bds$d_ln_emp_1 = data_gsp_bds$ln_emp - data_gsp_bds$ln_emp_lagged1
data_gsp_bds$d_ln_emp_per_estab_1 = data_gsp_bds$ln_emp_per_estab -  data_gsp_bds$ln_emp_per_estab_lagged1 
data_gsp_bds$d_ln_emp_per_estab_entrant_1 = data_gsp_bds$ln_emp_per_estab_entrant -  data_gsp_bds$ln_emp_per_estab_entrant_lagged1 
data_gsp_bds$d_ln_GSP_worker_1 = data_gsp_bds$ln_GSP_worker - data_gsp_bds$ln_GSP_worker_lagged1


# 41 year lag
h = 41
data_gsp_bds$ln_estabs_lag_lagged41 <- dplyr::lag(data_gsp_bds$ln_estabs_lag, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_estab_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp_per_estab, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_emp_per_estab_entrant_lagged41 <- dplyr::lag(data_gsp_bds$ln_emp_per_estab_entrant, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds$ln_GSP_worker_lagged41 <- dplyr::lag(data_gsp_bds$ln_GSP_worker, n=h, order_by = cbind( data_gsp_bds$st, data_gsp_bds$year))
data_gsp_bds[data_gsp_bds$year<=firstyear+h-1, c("ln_estabs_lag_lagged41","ln_emp_lagged41","ln_emp_per_estab_lagged41","ln_emp_per_estab_entrant_lagged41","ln_GSP_worker_lagged41")]<- NA

# 41 year change in the regression variables
data_gsp_bds$d_ln_estabs_lag_41 = data_gsp_bds$ln_estabs_lag - data_gsp_bds$ln_estabs_lag_lagged41
data_gsp_bds$d_ln_emp_41 = data_gsp_bds$ln_emp - data_gsp_bds$ln_emp_lagged41
data_gsp_bds$d_ln_emp_per_estab_41 = data_gsp_bds$ln_emp_per_estab -  data_gsp_bds$ln_emp_per_estab_lagged41 
data_gsp_bds$d_ln_emp_per_estab_entrant_41 = data_gsp_bds$ln_emp_per_estab_entrant -  data_gsp_bds$ln_emp_per_estab_entrant_lagged41 
data_gsp_bds$d_ln_GSP_worker_41 = data_gsp_bds$ln_GSP_worker - data_gsp_bds$ln_GSP_worker_lagged41

View(data_gsp_bds)


#####################  Table A3 #################### 
sigma = 4 
# Col 1
eq_col <- lm( d_ln_emp_per_estab_41 ~ d_ln_GSP_worker_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(d_ln_emp_per_estab_41 ~ d_ln_GSP_worker_41 + d_ln_estabs_lag_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
coef_estab = summary(eq_col)$coefficients["d_ln_estabs_lag_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_estabs_lag_41", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( d_ln_emp_per_estab_1 ~ d_ln_GSP_worker_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(d_ln_emp_per_estab_1 ~ d_ln_GSP_worker_1 + d_ln_estabs_lag_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
coef_estab = summary(eq_col)$coefficients["d_ln_estabs_lag_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_estabs_lag_1", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

table <- cbind(table.col1, table.col2, table.col3, table.col4)
table <- round(table, digits=3)
rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("41 years","41 years lagged N","1 years","1 years lagged N")
tableA3 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(tableA3)

#####################  Table A4 #################### 
sigma = 4 
# Col 1
eq_col <- lm( d_ln_emp_per_estab_entrant_41 ~ d_ln_GSP_worker_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(d_ln_emp_per_estab_entrant_41 ~ d_ln_GSP_worker_41 + d_ln_estabs_lag_41, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Estimate"]
coef_estab = summary(eq_col)$coefficients["d_ln_estabs_lag_41", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_41", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_estabs_lag_41", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( d_ln_emp_per_estab_entrant_1 ~ d_ln_GSP_worker_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(d_ln_emp_per_estab_entrant_1 ~ d_ln_GSP_worker_1 + d_ln_estabs_lag_1, data = data_gsp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Estimate"]
coef_estab = summary(eq_col)$coefficients["d_ln_estabs_lag_1", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["d_ln_GSP_worker_1", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["d_ln_estabs_lag_1", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

table <- cbind(table.col1, table.col2, table.col3, table.col4)
table <- round(table, digits=3)
rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("41 years","41 years lagged N","1 years","1 years lagged N")
tableA4 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(tableA4)

# save regression results
write.csv(tableA3,file=paste(wkdir, "output/tables/tableA3.csv",  sep="/"), row.names=TRUE)
write.csv(tableA4,file=paste(wkdir, "output/tables/tableA4.csv",  sep="/"), row.names=TRUE)


